

* Close any log left open by an earlier run, then open this script's text log
capture log close
log using "$log/04_str_conn_input_BeH_skill_ver${ver}_${S_DATE}", replace text

********************************************************************************
*
* This program creates an input file of moves that are used to find the 
* largest strongly connected set of establishment-years (for tvakm). 
*  
* The main input of this file is movers_ver$ver.dta, which is a person-estab
* level file recording all moves. The output file is
* flows_skill_ver$ver.csv, which is analyzed using the BGL functions in Matlab
* to extract the largest strongly connected set.
********************************************************************************


* Load the file of moves.
use "$temp/movers_ver${ver}.dta", clear

* Attach beh_beruf_num and the high_skill flag for the origin spell.
* keep(1 3) retains every move whether or not a match is found.
* Paths use forward slashes (valid on every platform Stata runs on) to match
* the rest of this file.
merge m:1 persnr spell using "$temp/BeH_large", keepusing(beh_beruf_num) nogen keep(1 3)
merge m:1 beh_beruf_num using "$temp/high_skill_cleaned_obs_all", keepusing(high_skill) nogen keep(1 3)
rename high_skill high_skill_current
rename beh_beruf_num beh_beruf_num_current

* Repeat for the destination spell: temporarily let tospell play the role of
* the merge key "spell", then restore the original variable names.
rename spell spell_orig
rename tospell spell
merge m:1 persnr spell using "$temp/BeH_large", keepusing(beh_beruf_num) nogen keep(1 3)
merge m:1 beh_beruf_num using "$temp/high_skill_cleaned_obs_all", keepusing(high_skill) nogen keep(1 3)
rename high_skill high_skill_to
rename beh_beruf_num beh_beruf_num_to
rename spell tospell
rename spell_orig spell


*This file now contains the set of moves over which we would like to find
*the largest strongly connected set. 

* The rates file is keyed on betnr, so the sender id borrows that name for the merge
rename currentid betnr

* Attach ee/en rates by sending establishment, year and q
merge m:1 betnr year q using "$results/estab_eeenrates_byyr_ver$ver.dta"
tabulate _merge

* Non-employment (betnr 1) does not have estab stats: treat it as matched
replace _merge = 3 if betnr == 1
tabulate _merge

* _merge 1: no estimated eeenrates_byyr, either a small establishment that we
* dropped or a singleton obs (birth and death or dropped)
drop if _merge == 1
tabulate _merge

* _merge 2: establishments without observed flow, rate imputed
drop if _merge != 3
drop _merge

rename betnr currentid

* Remove firms that do not hire from N: keep a move only if its sender also
* appears as the destination of at least one move out of non-employment
* (currentid 1). Moves out of non-employment itself are always kept.
* The list of hiring firms is written to a Stata tempfile instead of a
* fixed-name scratch dataset, so no existing file can be overwritten or erased
* and concurrent runs cannot collide; Stata removes the tempfile itself.
tempfile hires_from_n
preserve
keep if currentid==1
keep toid
rename toid currentid
duplicates drop
save "`hires_from_n'"
restore

merge m:1 currentid using "`hires_from_n'"
replace _merge = 3 if currentid==1
tab _merge
keep if _merge == 3
drop _merge

* Every remaining row other than non-employment must carry both rates
assert !missing(eerate_avgall) if currentid != 1
assert !missing(enrate_avgall) if currentid != 1

******************

keep currentid toid year q high_skill_current high_skill_to

* The raw establishment numbers move to betnr_current / betnr_to so that
* currentid / toid can be rebuilt as firm-skill ids below
rename currentid betnr_current
rename toid betnr_to

*Create firm-skill ID variable for sender (current) and receiver (to)
foreach side in current to {
	* establishment number * 10 + skill flag, shifted up by one
	gen double `side'id = (betnr_`side' * 10 + high_skill_`side')
	replace `side'id = `side'id + 1
	* establishment number 1 keeps the single id 1 regardless of skill
	replace `side'id = 1 if betnr_`side' == 1
	format `side'id %12.0g
}

* Rows whose id could not be built are discarded (sender first, then receiver)
drop if missing(currentid)
drop if missing(toid)

* Remove firm-skill ids that do not hire from N: keep a move only if its
* sender id also appears as the destination of at least one move out of
* non-employment (currentid 1). Moves out of non-employment are always kept.
* The list of hiring ids is written to a Stata tempfile instead of a
* fixed-name scratch dataset, so no existing file can be overwritten or erased
* and concurrent runs cannot collide; Stata removes the tempfile itself.
tempfile hires_from_n_skill
preserve
keep if currentid==1
keep toid
rename toid currentid
duplicates drop
save "`hires_from_n_skill'"
restore

merge m:1 currentid using "`hires_from_n_skill'"
replace _merge = 3 if currentid==1
tab _merge
keep if _merge == 3
drop _merge



* Save the trimmed set of moves
keep betnr_current betnr_to currentid toid year q
compress
save "$temp/str_conn_input_skill_ver$ver.dta", replace


*-------------------------------------------------------------------------------
*Part2: prepare the set of establishments to determine the largest strongly 
*connected set
*-------------------------------------------------------------------------------



*Create id's corresponding to matrix entries for adjacency matrix
* Stack the sender and receiver ids into one column a: park the receivers in a
* tempfile, keep the senders, then append the receivers underneath
tempfile receiver_ids
preserve
	keep toid
	rename toid a
	save "`receiver_ids'"
restore
keep currentid
rename currentid a
append using "`receiver_ids'"

* One row per distinct id, numbered 1..N in ascending id order
duplicates drop
gen double estabid = a
sort estabid
gen double matrix_estabid = _n
count
compress

* The same id list is written under two names
foreach stem in estab_matrix_ids estab_matrix_ids_orig {
	save "$temp/`stem'_skill_ver$ver.dta", replace
}


*Re-input the trimmed movers file
* (paths are quoted so that a temp directory containing spaces still works)
use "$temp/str_conn_input_skill_ver$ver", clear

*Get sender matrix id
rename currentid estabid
merge m:1 estabid using "$temp/estab_matrix_ids_skill_ver$ver.dta"
* The id list was built from sender and receiver ids together, so _merge==2
* rows are ids that never occur as a sender in this file; they are dropped
keep if _merge==3
drop _merge a

*Obtain the list of senders that meet all the restrictions from part 1
preserve
	keep estabid matrix_estabid
	duplicates drop
	count
	save "$temp/restricted_senders_skill_ver$ver.dta", replace
restore

rename estabid currentid
rename matrix_estabid matrix_currentid

*Get receiver matrix id 
rename toid estabid
merge m:1 estabid using "$temp/estab_matrix_ids_skill_ver$ver"
keep if _merge==3
drop _merge

*Throw out receivers that are not in the sender list
*Note: this step is crucial as it makes the flows matrix square!
merge m:1 estabid using "$temp/restricted_senders_skill_ver$ver"
tab _merge
*_merge==1 are receiving establishments that are not in the restricted sender list
*_merge==2 are estabs in the restricted sender list that are not receivers
keep if _merge==3
drop _merge a
rename estabid toid
rename matrix_estabid matrix_toid
* Only the two matrix index variables are needed from here on
keep matrix*

*Flows
* Each remaining move contributes one unit of flow
gen flows = 1 //not actual flows for NE 

*Make matrix square
* Largest sender index and largest receiver index currently in the data
sum matrix_currentid, d
local mxcurr=r(max)
sum matrix_toid, d
local mxto=r(max)

* Pad with zero-flow diagonal entries for every index between the smaller and
* the larger of the two maxima, so both dimensions end at the same index.
* The padding starts above the SMALLER maximum: starting above mxto
* unconditionally would, whenever mxto exceeds mxcurr, create indices beyond
* both maxima that correspond to no id.
local obstoadd = abs(`mxcurr'-`mxto')
local padstart = min(`mxcurr', `mxto')
* Skip the padding when the maxima already agree (or the data are empty)
if `obstoadd' > 0 & !missing(`obstoadd') {
	tempfile squareobs
	preserve
	quietly {
		clear
		set obs `obstoadd'
		* Row k gets index padstart + k on both axes
		gen double matrix_currentid = `padstart' + _n
		gen double matrix_toid = matrix_currentid
		gen flows=0
		save `squareobs', replace
	}
	restore
	append using `squareobs'
}


*Collapse by sender-receiver pairs
* One row per (sender, receiver) index pair; flows is the sum over moves
collapse (sum) flows, by(matrix_currentid matrix_toid)
label var flows "flows"

compress
* Forward slashes are used in the path so the export works on every platform
* Stata runs on, matching the other paths in this file
export delimited using "$matlab/sconn_idfy/flows_skill_ver$ver.csv", datafmt   replace

cap log close
